1   package org.apache.solr.handler.clustering;
2   /*
3    * Licensed to the Apache Software Foundation (ASF) under one or more
4    * contributor license agreements.  See the NOTICE file distributed with
5    * this work for additional information regarding copyright ownership.
6    * The ASF licenses this file to You under the Apache License, Version 2.0
7    * (the "License"); you may not use this file except in compliance with
8    * the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.File;
20  import java.util.Map;
21  
22  import org.apache.commons.io.FileUtils;
23  import org.apache.solr.SolrTestCaseJ4;
24  import org.apache.solr.common.SolrInputDocument;
25  import org.junit.BeforeClass;
26  
27  
28  /**
29   *
30   */
31  public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
32    protected static int numberOfDocs = 0;
33  
34    @BeforeClass
35    public static void beforeClass() throws Exception {
36      File testHome = createTempDir().toFile();
37      FileUtils.copyDirectory(getFile("clustering/solr"), testHome);
38      initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
39      numberOfDocs = 0;
40      for (String[] doc : DOCUMENTS) {
41        assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
42        numberOfDocs++;
43      }
44      
45      // Add a multi-valued snippet
46      final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
47      multiValuedSnippet.addField("id", numberOfDocs++);
48      multiValuedSnippet.addField("title", "Title");
49      multiValuedSnippet.addField("url", "URL");
50      multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
51      multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
52      multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
53      assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
54  
55      // Add a document with multi-field title and snippet
56      final SolrInputDocument multiFieldDoc = new SolrInputDocument();
57      multiFieldDoc.addField("id", numberOfDocs++);
58      multiFieldDoc.addField("title", "Title field");
59      multiFieldDoc.addField("heading", "Heading field");
60      multiFieldDoc.addField("url", "URL");
61      multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
62      multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
63      assertNull(h.validateUpdate(adoc(multiFieldDoc)));
64      
65      // Add a document with one language supported by Carrot2
66      final SolrInputDocument docWithOneSupprtedLanguage = new SolrInputDocument();
67      docWithOneSupprtedLanguage.addField("id", numberOfDocs++);
68      docWithOneSupprtedLanguage.addField("title", "");
69      docWithOneSupprtedLanguage.addField("url", "one_supported_language");
70      docWithOneSupprtedLanguage.addField("lang", "zh-cn");
71      assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguage)));
72      
73      // Add a document with more languages, one supported by Carrot2
74      final SolrInputDocument docWithOneSupprtedLanguageOfMany = new SolrInputDocument();
75      docWithOneSupprtedLanguageOfMany.addField("id", numberOfDocs++);
76      docWithOneSupprtedLanguageOfMany.addField("url", "one_supported_language_of_many");
77      docWithOneSupprtedLanguageOfMany.addField("lang", "zh-tw");
78      docWithOneSupprtedLanguageOfMany.addField("lang", "POLISH");
79      docWithOneSupprtedLanguageOfMany.addField("lang", "de");
80      assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguageOfMany)));
81      
82      // Add a document with more languages, one supported by Carrot2
83      final SolrInputDocument docWithCustomFields = new SolrInputDocument();
84      docWithCustomFields.addField("id", numberOfDocs++);
85      docWithCustomFields.addField("url", "custom_fields");
86      docWithCustomFields.addField("intfield_i", 10);
87      docWithCustomFields.addField("floatfield_f", 10.5);
88      docWithCustomFields.addField("heading", "first");
89      docWithCustomFields.addField("heading", "second");
90      assertNull(h.validateUpdate(adoc(docWithCustomFields)));
91      assertNull(h.validateUpdate(commit()));
92    }
93  
94    /**
95     * Expose package-scope methods from {@link ClusteringComponent} to tests.
96     */
97    protected final Map<String,SearchClusteringEngine> getSearchClusteringEngines(ClusteringComponent comp) {
98      return comp.getSearchClusteringEngines();
99    }
100 
101   final static String[][] DOCUMENTS = new String[][]{
102           {"http://en.wikipedia.org/wiki/Data_mining",
103                   "Data Mining - Wikipedia",
104                   "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."},
105 
106 
107           {"http://en.wikipedia.org/wiki/Datamining",
108                   "Data mining - Wikipedia, the free encyclopedia",
109                   "Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."},
110 
111 
112           {"http://www.statsoft.com/textbook/stdatmin.html",
113                   "Electronic Statistics Textbook: Data Mining Techniques",
114                   "Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."},
115 
116 
117           {"http://www.thearling.com/text/dmwhite/dmwhite.htm",
118                   "An Introduction to Data Mining",
119                   "Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."},
120 
121 
122           {"http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
123                   "Data Mining: What is Data Mining?",
124                   "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."},
125 
126 
127           {"http://www.spss.com/datamine",
128                   "Data Mining Software, Data Mining Applications and Data Mining Solutions",
129                   "The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."},
130 
131 
132           {"http://www.kdnuggets.com/",
133                   "KD Nuggets",
134                   "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."},
135 
136 
137           {"http://www.answers.com/topic/data-mining",
138                   "data mining: Definition from Answers.com",
139                   "data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."},
140 
141 
142           {"http://www.statsoft.com/products/dataminer.htm",
143                   "STATISTICA Data Mining and Predictive Modeling Solutions",
144                   "GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."},
145 
146 
147           {"http://datamining.typepad.com/",
148                   "Data Mining: Text Mining, Visualization and Social Media",
149                   "Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."},
150 
151 
152           {"http://www.twocrows.com/",
153                   "Two Crows Corporation",
154                   "Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."},
155 
156 
157           {"http://www.thearling.com/",
158                   "Thearling.com",
159                   "Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."},
160 
161 
162           {"http://www.ccsu.edu/datamining/",
163                   "CCSU - Data Mining",
164                   "Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."},
165 
166 
167           {"http://www.oracle.com/technology/products/bi/odm",
168                   "Oracle Data Mining",
169                   "Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."},
170 
171 
172           {"http://databases.about.com/od/datamining/a/datamining.htm",
173                   "Data Mining: An Introduction",
174                   "About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."},
175 
176 
177           {"http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
178                   "Open Directory - Computers: Software: Databases: Data Mining",
179                   "Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."},
180 
181 
182           {"http://www.cs.wisc.edu/dmi/",
183                   "DMI:Data Mining Institute",
184                   "Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."},
185 
186 
187           {"http://www.the-data-mine.com/",
188                   "The Data Mine",
189                   "Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."},
190 
191 
192           {"http://www.statserv.com/datamining.html",
193                   "St@tServ - About Data Mining",
194                   "St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."},
195 
196 
197           {"http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
198                   "MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
199                   "Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."},
200 
201 
202           {"http://www.pentaho.com/products/data_mining/",
203                   "Pentaho Commercial Open Source Business Intelligence: Data Mining",
204                   "For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."},
205 
206 
207           {"http://www.investorhome.com/mining.htm",
208                   "Investor Home - Data Mining",
209                   "Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."},
210 
211 
212           {"http://www.datamining.com/",
213                   "Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
214                   "Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."},
215 
216 
217           {"http://www.sourcewatch.org/index.php?title=Data_mining",
218                   "Data mining - SourceWatch",
219                   "These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."},
220 
221 
222           {"http://www.autonlab.org/tutorials/",
223                   "Statistical Data Mining Tutorials",
224                   "Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."},
225 
226 
227           {"http://www.microstrategy.com/data-mining/index.asp",
228                   "Data Mining",
229                   "With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."},
230 
231 
232           {"http://www.datamininglab.com/",
233                   "Elder Research",
234                   "Provides consulting and short courses in data mining and pattern discovery patterns in data."},
235 
236 
237           {"http://www.sqlserverdatamining.com/",
238                   "SQL Server Data Mining > Home",
239                   "SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."},
240 
241 
242           {"http://databases.about.com/cs/datamining/g/dmining.htm",
243                   "Data Mining",
244                   "What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."},
245 
246 
247           {"http://www.sas.com/technologies/analytics/datamining/index.html",
248                   "Data Mining Software and Text Mining | SAS",
249                   "... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."}
250   };
251 }